#!/usr/bin/perl -w
# Splits a dataset into a training and a test dataset by moving the first
# rating of each user from the training set into the test set.
# The input data must have one line per rating record. Each rating record is
# "<userid> <itemid> [...]". It is assumed that the dataset is sorted by user.
# For each user the first record will be placed into a test set.
# Three datasets are generated:
# - train:    The same as the input dataset but the first record of each user is
#             removed.
# - test:     Contains the first record of each user. Data format is the same as
#             train.
# - test_neg: 100 negative items for each user. Has as many lines as the test
#             set. Format "(<userid>, <itemid>)<TAB><negids>" where negids is a
#             tab-separated list of item ids. The negative items are sampled
#             from the set of all items. For a given user, the negative items
#             exclude all items from <train> and <test> for that user.
# The data generated by this script follows the same data format as in the
# paper: He et al. Neural Collaborative filtering, WWW17. This script is used
# to create the holdout split for hyperparameter tuning.
# Note that the dataset in this paper is sorted in reverse time order, so the
# first record in the file is the record that appeared the latest in time.
#
# Example:
# ./create_hold_out.pl --in Data/ml-1m.train.rating \
#                      --out_train Data/ml-1m.holdout.train.rating \
#                      --out_test Data/ml-1m.holdout.test.rating \
#                      --out_test_neg Data/ml-1m.holdout.test.negative
use strict;
use warnings;
use Getopt::Long qw(GetOptions);

# Command-line flags. All four are required:
#   --in            input rating file, one "<userid> <itemid> [...]" per line
#   --out_train     output file for the training set
#   --out_test      output file for the test set
#   --out_test_neg  output file for the sampled negative items
my ($flag_in, $flag_out_train, $flag_out_test, $flag_out_test_neg);
my $usage = "Usage: $0 --in <file> --out_train <file> --out_test <file> " .
            "--out_test_neg <file>\n";
GetOptions( 'in:s' => \$flag_in,
            'out_train:s' => \$flag_out_train,
            'out_test:s' => \$flag_out_test,
            'out_test_neg:s' => \$flag_out_test_neg,
      ) or die $usage;

# The flags take an optional value (":s"), so a flag can be absent (undef) or
# given without a value (empty string). Reject both up front instead of
# failing later with a confusing file error.
my %flag_value_of = (
  in           => $flag_in,
  out_train    => $flag_out_train,
  out_test     => $flag_out_test,
  out_test_neg => $flag_out_test_neg,
);
for my $flag_name (qw(in out_train out_test out_test_neg)) {
  my $value = $flag_value_of{$flag_name};
  unless (defined($value) && length($value)) {
    die "Missing required flag --$flag_name\n$usage";
  }
}

# Get the number of items.
# First pass over the input: the item count is the largest item id seen plus
# one, i.e. item ids are treated as integers starting at 0.
defined($flag_in) && length($flag_in)
    or die "No input file given (--in)\n";
my $num_items = -1;
# Use "or" rather than "||": "||" binds to the filename, so a failed open()
# would go unnoticed.
open(my $IN, "<", $flag_in)
    or die "Cannot open '$flag_in' for reading: $!\n";
# Read line by line instead of loading the whole dataset into memory.
while (my $line = <$IN>) {
  chomp $line;
  next unless $line =~ /\S/;  # Skip blank lines.
  # split ' ' ignores leading whitespace and treats runs of whitespace as a
  # single separator, so no empty fields are produced.
  my ($user, $item) = split ' ', $line;
  $item += 0;
  if ($item > $num_items) {
    $num_items = $item;
  }
}
close $IN;
$num_items += 1;
print "Number of items: $num_items\n";

# Write the results
#
# Second pass over the input: the first record of every user goes to the test
# set, all remaining records go to the training set, and one line of sampled
# negative items is written per test record.

# Number of negative items sampled for each test record.
my $NUM_NEGATIVES = 100;

# Opens a file and returns the handle; dies with a descriptive message if the
# path is missing or the file cannot be opened.
#   $mode      - open mode, e.g. "<" or ">"
#   $path      - file name
#   $flag_name - name of the command-line flag the path came from (for messages)
sub open_or_die {
  my ($mode, $path, $flag_name) = @_;
  defined($path) && length($path)
      or die "Missing required flag --$flag_name\n";
  # "or" (not "||") so that the check applies to open() and not to $path.
  open(my $fh, $mode, $path)
      or die "Cannot open '$path' (--$flag_name): $!\n";
  return $fh;
}

# Samples distinct negative item ids for one user, uniformly from
# [0, $item_count), excluding the user's test item and training items.
#   $item_count  - total number of items
#   $test_item   - the user's held-out item
#   $train_items - hash ref whose keys are the user's training item ids
#   $num_wanted  - number of negatives requested
# Returns an array ref with min($num_wanted, number of eligible items) ids, so
# the sampling always terminates even for users who rated almost every item.
sub sample_negatives {
  my ($item_count, $test_item, $train_items, $num_wanted) = @_;

  my %excluded = (%{$train_items}, $test_item => 1);
  # Only excluded ids that can actually be proposed reduce the eligible pool.
  my $num_excluded = grep { /^\d+$/ && $_ < $item_count } keys %excluded;
  my $num_available = $item_count - $num_excluded;
  my $target = $num_wanted < $num_available ? $num_wanted : $num_available;

  my %negs;
  if ($target == $num_available) {
    # Every eligible item is needed, so enumerate them instead of sampling.
    for my $candidate (0 .. $item_count - 1) {
      $negs{$candidate} = 1 unless exists $excluded{$candidate};
    }
  } else {
    # Rejection sampling; terminates because $target < $num_available.
    while (scalar(keys(%negs)) < $target) {
      my $proposed_item = int(rand($item_count));
      $negs{$proposed_item} = 1 unless exists $excluded{$proposed_item};
    }
  }
  return [keys %negs];
}

# Samples the negatives for one user and writes the corresponding line,
# "(<userid>, <itemid>)\t<neg>\t<neg>...", to $out_fh. Warns if fewer than
# $num_wanted negatives exist for the user.
sub write_negatives {
  my ($out_fh, $user, $test_item, $train_items, $item_count, $num_wanted) = @_;
  my $negs =
      sample_negatives($item_count, $test_item, $train_items, $num_wanted);
  if (scalar(@{$negs}) < $num_wanted) {
    warn "User $user: only " . scalar(@{$negs}) .
         " negative items available, wanted $num_wanted\n";
  }
  print {$out_fh}
      "($user, $test_item)\t" .
      join("\t", @{$negs}) . "\n";
  return;
}

$IN = open_or_die("<", $flag_in, "in");
my $OUT_TRAIN = open_or_die(">", $flag_out_train, "out_train");
my $OUT_TEST = open_or_die(">", $flag_out_test, "out_test");
my $OUT_TEST_NEG = open_or_die(">", $flag_out_test_neg, "out_test_neg");

my $prev_user;            # undef until the first record has been read.
my $prev_user_test_item;  # Test item of $prev_user.
my %train_items;          # Training items of $prev_user.
# Read line by line instead of loading the whole dataset into memory.
while (my $in_line = <$IN>) {
  chomp $in_line;
  next unless $in_line =~ /\S/;  # Skip blank lines.
  # split ' ' treats runs of whitespace as one separator (no empty fields).
  my ($user, $item) = split ' ', $in_line;
  $user += 0;
  $item += 0;
  if (!defined($prev_user) || $prev_user != $user) {
    # First record of a new user: it becomes that user's test record.
    print {$OUT_TEST} $in_line, "\n";

    # Flush negs for previous user.
    if (defined $prev_user) {
      write_negatives($OUT_TEST_NEG, $prev_user, $prev_user_test_item,
                      \%train_items, $num_items, $NUM_NEGATIVES);
    }
    $prev_user = $user;
    $prev_user_test_item = $item;
    %train_items = ();
  } else {
    print {$OUT_TRAIN} $in_line, "\n";
    $train_items{$item} = 1;
  }
}

# Flush negs for the last user.
if (defined $prev_user) {
  write_negatives($OUT_TEST_NEG, $prev_user, $prev_user_test_item,
                  \%train_items, $num_items, $NUM_NEGATIVES);
}

# Buffered write errors only surface at close, so check the output handles.
close($OUT_TEST_NEG) or die "Error closing '$flag_out_test_neg': $!\n";
close($OUT_TEST) or die "Error closing '$flag_out_test': $!\n";
close($OUT_TRAIN) or die "Error closing '$flag_out_train': $!\n";
close $IN;

__END__
